Applying models to Nanopore data#
Step 1: Read output from modkit#
Show code cell source
path = r"../Data/Nanopore_data/Kasumi1-naive-p2solo.bed/Kasumi1-naive-p2solo.bed"
sample_name = 'Kasumi1-naive'
import pandas as pd
# Column names for the two parts
columns_part1 = ["chrom", "start_position", "end_position", "modified_base_code", "score", "strand", "start_position_compat", "end_position_compat", "color"]
columns_part2 = ["N_valid_cov", "fraction_modified", "N_mod", "N_canonical", "N_other_mod", "N_delete", "N_fail", "N_diff", "N_nocall"]
# Read the first part with '\t' delimiter
df_part1 = pd.read_csv(path, sep='\t', header=None, usecols=range(9), names=columns_part1)
# Read the second part with ' ' delimiter
df_part2 = pd.read_csv(path, delim_whitespace=True, header=None, usecols=range(9, 18), names=columns_part2)
# Concatenate the two dataframes horizontally
df = pd.concat([df_part1, df_part2], axis=1)
# Filter the dataframe by modified base code
df_5mc = df[df["modified_base_code"] == "m"]
df_5hmc = df[df["modified_base_code"] == "h"]
df['modified_base_code'].value_counts(dropna=False)
modified_base_code
h 28450341
m 28450341
Name: count, dtype: int64
Show code cell source
df_5mc['score'].mean()
13.171531265653371
Step 2: Load CpG probe array coordinates for build 38#
Show code cell source
# Load the list of suboptimal probes
zhou2016_probes = pd.read_csv('../Data/UnreliableProbesList_Zhou2016/EPIC.anno.GRCh38.tsv', sep='\t')
# Add a column with the coordinate of the probe
df_5mc['coordinate'] = df_5mc['chrom'].astype(str) + ':' + df_5mc['start_position'].astype(str)
zhou2016_probes['coordinate'] = zhou2016_probes['chrm'].astype(str) + ':' + zhou2016_probes['start'].astype(str)
# Merge the two dataframes
df_5mc_merged = pd.merge(df_5mc, zhou2016_probes[['probeID','coordinate']], on='coordinate', how='inner')
# Set the index to the probeID
df_5mc_merged = df_5mc_merged.set_index('probeID')
# Create beta values column
df_5mc_merged[sample_name] = (df_5mc_merged['fraction_modified'] / 100).round(3)
# Create a new dataframe with only the beta values
df_nanopore = df_5mc_merged[[sample_name]].T
df_nanopore
/tmp/ipykernel_290483/610380037.py:5: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
df_5mc['coordinate'] = df_5mc['chrom'].astype(str) + ':' + df_5mc['start_position'].astype(str)
| probeID | cg14817997 | cg26928153 | cg16269199 | cg13869341 | cg14008030 | cg12045430 | cg20826792 | cg18231760 | cg02404219 | cg21870274 | ... | cg00180869 | cg05964935 | cg00618696 | cg25363292 | cg12170560 | cg08890132 | cg00491786 | cg26559055 | cg07587934 | cg16855331 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Kasumi1-naive | 1.0 | 1.0 | 1.0 | 0.938 | 0.941 | 0.0 | 0.0 | 0.5 | 1.0 | 1.0 | ... | 0.867 | 0.667 | 0.901 | 0.92 | 0.971 | 0.953 | 0.714 | 1.0 | 0.967 | 0.734 |
1 rows × 860482 columns
Step 3: Merge results with train and test array data#
Load array data#
Show code cell source
import pandas as pd
import numpy as np
from source.pacmap_functions import *
input_path = '../Data/Intermediate_Files/'
output_path = '../Data/Processed_Data/'
# read df_discovery and df_validation
df_discovery = pd.read_pickle(
input_path+'df_discovery.pkl').sort_index()
# Load clinical data
discovery_clinical_data = pd.read_csv(input_path+'discovery_clinical_data.csv',
low_memory=False, index_col=0)
# Create a new dataframe to contain metadata for `df_nanopore`, which represents sample 'UF_hem_1832_PB'
validation_clinical_data = pd.DataFrame(index=df_nanopore.index)
# Adjust clinical data
discovery_clinical_data['Train Test'] = 'Discovery (train) Samples'
validation_clinical_data['Train Test'] = sample_name
discovery_clinical_data['PaCMAP Output'] = 'Patient Samples'
validation_clinical_data['PaCMAP Output'] = 'Patient Samples'
discovery_clinical_data['Batch'] = df_discovery['Batch']
validation_clinical_data['Batch'] = 'Nanopore'
# add the following columns to `validation_clinical_data`: 'Clinical Trial', 'Sample Type', 'Patient_ID', 'ELN AML 2022 Diagnosis', 'Hematopoietic Lineage'
validation_clinical_data['Clinical Trial'] = 'UF Hem Bank'
validation_clinical_data['Sample Type'] = 'Peripheral Blood'
validation_clinical_data['Patient_ID'] = sample_name
validation_clinical_data['ELN AML 2022 Diagnosis'] = np.nan
validation_clinical_data['Hematopoietic Lineage'] = np.nan
Select CpGs in both train and nanopore sample#
Show code cell source
# use overlapping features between df_discovery and df_nanopore
common_features = [x for x in df_discovery.columns if x in df_nanopore.columns]
# apply `common_features` to both df_discovery and df_nanopore
df_discovery = df_discovery[common_features]
df_nanopore = df_nanopore[common_features]
print(
f' Discovery dataset (df_discovery) contains {df_discovery.shape[1]} \
columns (5mC nucleotides/probes) and {df_discovery.shape[0]} rows (samples).')
print(
f' Validation dataset (df_nanopore) contains {df_nanopore.shape[1]} \
columns (5mC nucleotides/probes) and {df_nanopore.shape[0]} rows (samples).')
output_notebook()
df_validation = df_nanopore.copy()
# Set the theme for the plot
curdoc().theme = 'light_minimal' # or 'dark_minimal'
Show code cell source
clinical_trials = ['AAML0531', 'AAML1031', 'AAML03P1', 'CCG2961', 'Japanese AML05']
sample_types = ['Diagnosis', 'Primary Blood Derived Cancer - Bone Marrow', 'Bone Marrow Normal',
'Primary Blood Derived Cancer - Peripheral Blood', 'Blood Derived Normal']
cols = ['Clinical Trial', 'Sample Type', 'Patient_ID', 'ELN AML 2022 Diagnosis', 'Train Test', 'Batch']
components = [2]
for n in components:
processor = DataProcessor(discovery_clinical_data.copy(),
df_discovery,
clinical_trials,
sample_types,
cols,
n_components=n,
common_prefix=output_path+f'pacmap_output/pacmap_{n}d_model_peds_dx_aml',
df_test=df_validation.copy(),
test_clinical_data=validation_clinical_data.copy())
processor.filter_data()
processor.apply_pacmap() # learn PaCMAP on the training data
processor.apply_pacmap_test() # apply PaCMAP to the test data
processor.join_labels() # join clinical data to the embedding
df = processor.df.copy()
# # Save output
# processor.df.to_csv(output_path+f'pacmap_output/pacmap_{n}d_model_peds_dx_aml.csv')
The PaCMAP instance is successfully saved at ../Data/Processed_Data/pacmap_output/pacmap_2d_model_peds_dx_aml.pkl.
To load the instance again, please do `pacmap.load(../Data/Processed_Data/pacmap_output/pacmap_2d_model_peds_dx_aml)`.
Plot Sample in AML Map#
Show code cell source
# Concatenate discovery and validation clinical data
clinical_data = pd.concat([discovery_clinical_data, validation_clinical_data]).loc[df['index']]
# Select columns to plot
cols = ['PaCMAP Output','Hematopoietic Lineage','WHO AML 2022 Diagnosis','ELN AML 2022 Diagnosis', 'FAB', 'FLT3 ITD', 'Age (group years)',
'Complex Karyotype', 'Primary Cytogenetic Code' ,'Batch', 'Sex', 'MRD 1 Status',
'Leucocyte counts (10⁹/L)', 'Risk Group', 'Race or ethnic group',
'Clinical Trial', 'Vital Status','First Event','Sample Type', 'Train Test']
# Join clinical data to the embedding
df = df.join(clinical_data[cols], rsuffix='_copy', on='index')
plotter = BokehPlotter(df, cols, get_custom_color_palette(),
title='The Methylome Atlas of Pediatric AML',
x_range=(-45, 45), y_range=(-45, 45),
datapoint_size=3, tooltip_dx_cols='ELN AML 2022 Diagnosis',
width=1000, height=500)
plotter.plot()